// A Rust implemention of Google's simhash algorithm.

extern crate itertools;

use itertools::Itertools;

fn _ngram(s: &[u8], width: usize) -> Vec<&[u8]> {
    let length = s.len() - width + 1;
    (0..length).map(|i| &s[i..i + width]).collect()
}

fn _hash_token(t: &[u8], n: usize) -> i128 {
    if t.len() == 0 {
        0
    } else {
        let k: i128 = 1_000_003;
        let m: i128 = (u128::max_value() >> (128 - n)) as i128;

        let mut x: i128 = (t[0] as i128) << 7;
        for c in t.iter() {
            let (xk, _) = x.overflowing_mul(k);
            x = xk ^ (*c as i128) & m;
        }
        x ^ t.len() as i128
    }
}

fn simhash(s: &[u8], n: usize, width: usize) -> u128 {
    let masks: Vec<i128> = (0..n).map(|i| 1 << i).collect();
    let mut sponge: Vec<i128> = (0..n).map(|_| 0).collect();
    let mut ngrams = _ngram(s, width);
    ngrams.sort();

    for (t, group) in &ngrams.iter().group_by(|x| *x) {
        let w: i128 = group.map(|_| 1).sum::<i128>();
        let hashed = _hash_token(&t, n);
        for i in 0..n {
            sponge[i] += if hashed & masks[i] == 0 { -w } else { w }
        }
    }

    let mut res: u128 = 0;
    for (i, v) in sponge.iter().enumerate() {
        if *v > 0i128 {
            res |= masks[i] as u128
        }
    }
    res
}

fn main() {
    println!("{:016X}", simhash("Hello, world!".as_bytes(), 64, 8));
}
